ChIP AML PiPeline v2

In [1]:
import os
import pandas as pd
import sys
sys.path.insert(0, '../..')
import itertools
from scipy import stats
import numpy as np

from JKBio.epigenetics import chipseq as chip
from JKBio.utils import helper
from JKBio.utils import plot
import igv
import SimpSOM as sps
from scipy import stats

import seaborn as sns
from matplotlib import cm
from matplotlib import pyplot as plt
from IPython.display import IFrame
import seaborn as sns
from bokeh.plotting import *
import igv

import numba
from numba import jit

from scipy.cluster.hierarchy import linkage, leaves_list
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans, OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from IPython.display import IFrame

from statsmodels.stats.multitest import multipletests

from pybedtools import BedTool
import pyBigWig

output_notebook()
%load_ext autoreload
%autoreload 2
Loading BokehJS ...
In [21]:
#setting basic parameters
project="Cobinding_ChIP"
version="v3"
merging_version="motifs"
window="150"
In [32]:
#retrieving the data from the previous notebook
%store -r mergedmot
%store -r cols
%store -r annot
%store -r version
%store -r window
%store -r crc
merging_version
Out[32]:
'motifs'

Correlation with Annotations

non binarized

In [35]:
for val in mergedmot.columns[72:]:
    mergedmot.loc[mergedmot[mergedmot[val]==0].index,val.split('_')[0]]=0
merged = mergedmot[mergedmot.columns[:72]].rename(columns={val.split('_')[0]: val.split('_')[0]+"_filtered" for val in mergedmot.columns[72:]})
In [41]:
#data = pd.DataFrame(np.corrcoef(stats.zscore(0.01+merged[merged.columns[cols:]].T)), columns=merged.columns[cols:], index = merged.columns[cols:])
data = pd.DataFrame(np.corrcoef(merged[merged.columns[cols:]].T.astype(bool)), columns=merged.columns[cols:], index = merged.columns[cols:])
link = linkage(data[:annot-cols])
col = data.iloc[annot-cols:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
rdb = cm.get_cmap('RdBu_r', 256)
for val in col.columns:
    a = [rdb(128+int(v*128)) for v in col[val]]
    col[val] = a
In [42]:
#correlation_withannotation
fig = sns.clustermap(data.iloc[:annot][data.columns[leaves_list(link)]], vmin=-1,vmax=1, col_colors=col.T, figsize=(10,20), cmap='RdBu_r', row_linkage=link, col_cluster=False, colors_ratio=0.01)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("correlation_withannotation")
#fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_correlation_with_annotation.pdf')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_boolcorrelation_with_annotation.pdf')

plt.show()

binarized

In [43]:
#data = pd.DataFrame(np.corrcoef(stats.zscore(0.01+merged[merged.columns[cols:]].T)), columns=merged.columns[cols:], index = merged.columns[cols:])
data = pd.DataFrame(np.corrcoef(merged[merged.columns[cols:]].T.astype(bool)), columns=merged.columns[cols:], index = merged.columns[cols:])
link = linkage(data[:annot-cols])
col = data.iloc[annot-cols:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
rdb = cm.get_cmap('RdBu_r', 256)
for val in col.columns:
    a = [rdb(128+int(v*128)) for v in col[val]]
    col[val] = a
In [44]:
#correlation_withannotation
fig = sns.clustermap(data.iloc[:annot][data.columns[leaves_list(link)]], vmin=-1,vmax=1, col_colors=col.T, figsize=(10,15), cmap='RdBu_r', row_linkage=link, col_cluster=False, colors_ratio=0.008)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("correlation_onbinarized_signal_withannotation")
#fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_correlation_with_annotation.pdf')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_boolcorrelation_onbinarized_signal_with_annotation.pdf')

plt.show()

Looking at peak overlap

How many of peak in A (column) overlaps with peak in B (rows)

in other words:

what is the percentage of B's peaks that are overlaped by A's peaks

In [45]:
overlap, correlation, pvals, enrichment = chip.pairwiseOverlap(merged, norm=True)
we will be correcting for fully similar lines/ columns by removing 1 on their last value
/home/jeremie/.local/lib/python3.8/site-packages/scipy/stats/stats.py:2419: RuntimeWarning: invalid value encountered in true_divide
  return (a - mns) / sstd
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2534: RuntimeWarning: invalid value encountered in true_divide
  c /= stddev[:, None]
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2535: RuntimeWarning: invalid value encountered in true_divide
  c /= stddev[None, :]
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2526: RuntimeWarning: Degrees of freedom <= 0 for slice
  c = cov(x, y, rowvar)
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2455: RuntimeWarning: divide by zero encountered in true_divide
  c *= np.true_divide(1, fact)
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2455: RuntimeWarning: invalid value encountered in multiply
  c *= np.true_divide(1, fact)
/home/jeremie/.local/lib/python3.8/site-packages/scipy/stats/stats.py:2416: RuntimeWarning: Mean of empty slice.
  mns = a.mean(axis=axis, keepdims=True)
/home/jeremie/.local/lib/python3.8/site-packages/numpy/core/_methods.py:153: RuntimeWarning: invalid value encountered in true_divide
  ret = um.true_divide(
/home/jeremie/.local/lib/python3.8/site-packages/numpy/core/_methods.py:216: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/jeremie/.local/lib/python3.8/site-packages/numpy/core/_methods.py:185: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/jeremie/.local/lib/python3.8/site-packages/numpy/core/_methods.py:206: RuntimeWarning: invalid value encountered in true_divide
  ret = um.true_divide(
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:393: RuntimeWarning: Mean of empty slice.
  avg = a.mean(axis)
../../JKBio/epigenetics/chipseq.py:1103: RuntimeWarning: divide by zero encountered in log2
  enrichment[i,j+add] = np.log2(e)
In [46]:
#saving the enrichments
overlap.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_overlap.csv')
correlation.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_correlation_on_overlap.csv')
enrichment.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_enrichment_on_overlap.csv')
pvals.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_enrichment_on_overlap_pvals.csv')

pairwise overlap

In [47]:
data = pd.DataFrame(data=overlap,index=merged.columns[cols:], columns=merged.columns[cols:])
link = linkage(data.iloc[:annot-cols]) # D being the measurement
c = data[[co for co in data.columns if co not in data.iloc[:annot-cols].index.tolist()]]
viridis = cm.get_cmap('viridis', 256)
for val in c.columns:
    a = [viridis(int(v*255)) for v in c[val]]
    c[val] = a
<ipython-input-47-a4be739a8716>:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c[val] = a
In [48]:
fig = sns.clustermap(data.iloc[:annot-cols][data.columns[np.concatenate((leaves_list(link), range(annot,len(data.columns))))]], row_linkage=link, col_colors=c,figsize=(20,18), colors_ratio=0.01, col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("pairwise_overlap_clustermap")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'pairwise_overlap_clustermap.pdf')
plt.show()

Correlation on overlaps

on the overlaps given above

In [49]:
data = pd.DataFrame(data = correlation, index = merged.columns[cols:], columns = merged.columns[cols:])
link = linkage(data.iloc[:annot-cols], optimal_ordering=True) # D being the measurement
c = data[[co for co in data.columns if co not in data.iloc[:annot-cols].index.tolist()]]
viridis = cm.get_cmap('viridis', 256)
for val in c.columns:
    a = [rdb(128+int(v*128)) for v in c[val]]
    c[val] = a
<ipython-input-49-63360ec74203>:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c[val] = a
In [50]:
fig = sns.clustermap(data.iloc[:annot-cols][data.columns[np.concatenate((leaves_list(link), range(annot,len(data.columns))))]], row_linkage=link, col_colors=c, colors_ratio=0.01, figsize=(20,18), vmin=-1, vmax=1, cmap='RdBu_r', col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("correlation_on_pairwise_overlap_clustermap")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_correlation_onoverlap.pdf')
plt.show()

Enrichment

In [51]:
link = linkage(enrichment, optimal_ordering=True)
fig = sns.clustermap(enrichment,figsize=(20,20), vmin=-5,vmax=5, cmap='RdBu_r', col_linkage=link, row_linkage=link)
fig.ax_col_dendrogram.set_visible(False)
fig.ax_row_dendrogram.set_visible(False)
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_enrichment_clustermap_all_to_all.pdf')
plt.show()
In [52]:
enrichment.loc[enrichment.columns[leaves_list(link)]][enrichment.columns[leaves_list(link)]].values.max()
Out[52]:
1000.0
In [53]:
link = linkage(enrichment, optimal_ordering=True)
plot.correlationMatrix(enrichment.loc[enrichment.columns[leaves_list(link)]][enrichment.columns[leaves_list(link)]].values, dataIsCorr=True, names=enrichment.columns[leaves_list(link)].tolist(), pvals=pvals[enrichment.columns[leaves_list(link)]].loc[enrichment.columns[leaves_list(link)]].values, size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/',interactive=True,minval=-4,maxval=4)
we are assuming you want to display the pvals of your correlation with size
1000.0
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:125: UserWarning: save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN
  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:138: UserWarning: save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")
Out[53]:
Figure(
id = '1206', …)

looking only at crcs

In [54]:
enrichment = enrichment[crc].loc[crc]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-54-63c65a2fadc3> in <module>
----> 1 enrichment = enrichment[crc].loc[crc]

~/.local/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2804             if is_iterator(key):
   2805                 key = list(key)
-> 2806             indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
   2807 
   2808         # take() does not accept boolean indexers

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1550             keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
   1551 
-> 1552         self._validate_read_indexer(
   1553             keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
   1554         )

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1644             if not (self.name == "loc" and not raise_missing):
   1645                 not_found = list(set(key) - set(ax))
-> 1646                 raise KeyError(f"{not_found} not in index")
   1647 
   1648             # we skip the warning on Categorical/Interval

KeyError: "['STAT5B', 'FLI1', 'MYC', 'RUNX1', 'FOSL2', 'LYL1', 'RXRA', 'IRF8', 'MYB', 'GATA2', 'ZFPM1', 'CEBPA', 'MEF2C', 'ZEB2', 'SREBF1', 'RUNX2', 'MEIS1', 'SP1', 'TFAP4', 'ETV6', 'E2F3', 'ZNF281', 'MAX', 'HOXA9', 'SPI1'] not in index"
In [ ]:
link = linkage(enrichment, optimal_ordering=True)
plot.correlationMatrix(enrichment.loc[enrichment.columns[leaves_list(link)]][enrichment.columns[leaves_list(link)]].values, dataIsCorr=True, names=enrichment.columns[leaves_list(link)].tolist(), pvals=pvals[enrichment.columns[leaves_list(link)]].loc[enrichment.columns[leaves_list(link)]].values, size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/',interactive=True,minval=-4,maxval=4)

clustering

I have tried gaussian mixtures and Agglomerative clustering algorithm. Only the second can create a hierarchical clustering.

It seems that gaussian mixture makes more sense given the data we have, for now, is more "homogeneous".

I am still not so happy with the clustering. It can be because of the how much importance, outlier values and the high number of noisy values from locations with no peaks.

We can use similar methods to RNAseq to improve this (clamping values, log transform, first round of PCA..)

using KMeans

In [96]:
data= merged[merged.columns[cols:annot]].values
# using score
scaled_data = stats.zscore(data)
# using regular scaling
#scaled_data = (data-data.min(0))/(data.max(0)-data.min(0))
In [56]:
data.max(0)
Out[56]:
array([ 107.04528   ,   36.84465   , 1424.57      ,   98.294     ,
         80.06743333,   60.1936    ,   82.7514    ,   85.43705   ,
         95.6669    ,  512.89      ,  191.227     ,  684.427     ,
        598.145     , 3947.19      ,  645.24      ,   43.670685  ,
        293.239     ,   45.17605   ,   67.9814    ,   79.553     ,
         79.09065   ,  811.304     ,   75.82466667,  110.47425   ,
        259.97310333,  125.9535    ,  812.18893333, 1240.17      ,
         88.84995   ])
In [97]:
#https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
n = 1
n_clust=[10,20,50,100,300]
kmean = KMeans(n_clusters=n_clust[n],n_jobs=8)
groups = kmean.fit_predict(scaled_data)
centroid = kmean.cluster_centers_

plotting center location of clusters (kind of enrichment)

In [116]:
 
Out[116]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
CEBPA_filtered 0.055835 -0.072791 0.073742 -0.104058 -0.104058 0.047216 0.157092 1.000000 -0.104058 -0.042501 -0.063077 -0.049214 -0.104058 0.075453 -0.104058 -0.050383 -0.104058 0.058827 -0.037007 -0.006478
HOXA9_filtered -0.001985 -0.003826 -0.000511 -0.003857 -0.003857 -0.003857 1.000000 -0.003407 -0.003857 -0.003857 -0.003639 -0.003337 -0.003857 -0.003857 -0.003857 -0.003857 -0.003857 -0.002852 -0.003757 -0.003623
PLAGL2 0.001377 -0.000509 0.001996 0.006459 0.004816 0.006235 -0.000208 -0.000483 0.000315 0.003812 0.003356 0.000025 0.004501 0.001093 0.008269 0.002536 1.000000 -0.000126 -0.000174 0.000197
MEIS1_filtered 0.175872 -0.021344 0.150142 -0.070230 0.104469 0.085363 1.000000 0.012641 -0.033178 0.023160 0.023175 0.059576 0.048550 0.026101 -0.070230 0.025371 -0.070230 0.052460 0.019068 0.095032
ETV6_filtered 0.076203 -0.014919 0.039121 -0.016875 0.397742 1.000000 0.020041 -0.011439 -0.016875 0.028140 0.010448 -0.005342 -0.016875 0.044385 -0.016875 -0.016875 -0.016875 0.014488 -0.000983 0.008658
HEX -0.005176 -0.001445 -0.005176 1.000000 -0.005176 -0.005176 -0.004651 -0.005152 -0.005176 -0.005176 -0.004992 -0.005176 -0.005176 -0.005176 -0.005176 -0.005176 -0.005176 -0.005176 -0.005176 -0.005176
FLAG_GFI1 0.128462 -0.035232 0.096588 1.000000 -0.099662 0.143110 0.109225 0.030552 0.309602 0.060888 0.048461 0.021553 -0.099662 0.053250 0.200470 0.037086 0.245387 0.050571 0.032792 0.243738
RUNX1_filtered 0.519447 -0.085966 0.305377 -0.146676 0.735915 0.508587 0.190537 -0.045041 -0.063925 0.114325 0.015250 0.020682 -0.146676 0.273342 0.746543 -0.096181 -0.146676 0.122856 -0.014618 1.000000
FOSL2_filtered 0.038114 -0.021058 0.020094 -0.023834 -0.023834 0.028901 0.051856 -0.017136 0.051849 -0.004661 -0.014549 1.000000 -0.023834 0.052057 -0.023834 -0.023834 -0.023834 -0.008381 -0.013379 -0.004198
FLI1_filtered 0.315824 -0.026663 0.092782 -0.045324 -0.047062 1.000000 0.031904 -0.019915 0.014653 0.107723 0.087069 -0.003164 0.294033 0.050095 -0.047062 0.078482 0.180073 0.019976 -0.004528 0.027806
GATA2_filtered 0.048894 -0.006169 0.012414 -0.011352 -0.011352 0.003092 0.007473 -0.002341 -0.003643 -0.005669 -0.004027 -0.003709 -0.011352 0.003607 -0.011352 -0.011352 -0.011352 1.000000 -0.000672 0.002572
SP1_filtered -0.018321 -0.063751 0.223716 -0.066775 0.710958 0.218519 -0.037870 -0.069968 0.389537 0.827408 0.542291 -0.058661 0.309757 0.035463 0.730593 -0.055674 1.000000 -0.042815 -0.068272 -0.043605
E2F3_filtered 0.000178 -0.001353 0.006612 -0.001542 1.000000 0.007135 -0.000473 -0.001384 0.000933 0.015753 0.010838 -0.000697 0.134227 0.002040 -0.001597 -0.001597 0.019462 -0.000395 -0.001176 -0.000666
ZNF281_filtered 0.000841 -0.001011 0.004300 -0.000399 0.154444 0.003824 -0.000679 -0.001409 1.000000 0.006651 0.005319 -0.000588 0.004127 0.000380 -0.001784 -0.001413 0.408849 -0.000808 -0.000811 -0.000486
SREBF1_filtered 0.000509 -0.000160 0.001001 0.000485 1.000000 0.001171 0.000114 -0.000121 -0.000178 0.001822 0.000828 0.000018 -0.000178 0.000448 -0.000178 -0.000178 -0.000178 0.000077 -0.000073 0.000007
LMO2 0.743586 -0.085306 0.496944 -0.100767 1.000000 0.509548 0.051991 -0.061604 -0.088521 0.629160 0.646750 -0.024501 0.638202 0.154959 -0.102080 -0.102080 0.663143 0.032642 -0.032194 0.027537
STAT5B_filtered 0.007557 -0.003225 0.010416 -0.003521 -0.006039 0.012772 0.002105 -0.001180 -0.004377 0.005666 0.005244 0.001443 0.092700 1.000000 0.054128 -0.003588 0.538885 0.007196 0.004280 0.004629
ZMYND8 0.266077 -0.073317 0.338290 -0.080148 0.426030 0.585420 0.146582 -0.027584 0.014065 0.442742 0.395716 0.032487 0.485112 0.217075 1.000000 0.016867 0.995572 0.040075 0.031826 0.188076
FLAG_MEF2C 0.002685 -0.003611 0.007411 1.000000 -0.007328 0.013438 -0.002897 -0.005599 0.065686 0.008816 0.006901 -0.003691 -0.007328 0.001516 -0.007328 -0.007328 0.212755 -0.003210 -0.002912 -0.002670
LYL1_filtered 1.000000 -0.010683 0.056416 -0.015029 -0.015029 0.061936 0.022067 -0.006742 -0.015029 0.003561 -0.000974 0.007939 -0.015029 0.039367 -0.015029 -0.015029 -0.015029 0.036829 0.000025 0.028241
MYC_filtered 0.072771 -0.025280 0.087808 -0.050655 -0.050655 0.083464 0.000328 -0.024780 0.046388 0.678872 0.011951 -0.007763 1.000000 0.031772 -0.050655 0.051756 0.446820 0.001524 -0.011571 0.009378
TFAP4_filtered 0.178572 -0.009482 0.056615 -0.020775 1.000000 0.053905 0.005742 -0.005387 0.003563 0.047609 0.036689 0.046237 -0.020989 0.044125 -0.020989 -0.004891 0.634963 0.002900 0.002126 0.022744
RUNX2_filtered 0.344206 -0.060332 0.242739 -0.108636 0.876062 0.332049 0.180724 -0.019592 -0.053542 0.121602 0.043967 0.020425 0.885329 0.179832 -0.111071 -0.084904 1.000000 0.088298 0.008310 0.617005
MAX_filtered 0.103121 -0.039940 0.138435 -0.064331 -0.066058 0.160225 0.006066 -0.040127 0.196244 1.000000 0.058335 -0.013675 0.733493 0.063575 0.201763 0.026067 0.723701 -0.002791 -0.025342 0.006784
IRF8_filtered 0.058281 -0.032604 0.024648 -0.058701 0.020591 0.065771 0.064737 -0.034352 -0.036860 -0.027082 -0.028587 -0.030761 -0.058960 0.045510 -0.058960 -0.058960 -0.058960 0.051466 1.000000 -0.011040
MYB_filtered 0.114752 -0.014909 1.000000 -0.021643 0.167876 0.051691 0.032883 -0.007667 -0.002085 0.043827 0.029216 0.000429 -0.021643 0.036711 0.194298 -0.004262 -0.021643 0.010819 -0.000835 0.016170
ZEB2_filtered 0.000727 -0.000208 0.000912 -0.000374 0.009508 0.001054 0.000059 -0.000046 0.000521 0.001189 0.000956 -0.000002 -0.000374 0.000300 1.000000 0.304123 0.010944 0.000017 -0.000001 0.000170
RXRA_filtered 0.002265 -0.000325 0.001868 -0.000024 0.028578 0.002321 0.000049 -0.000173 0.000327 0.002207 0.001725 0.000403 1.000000 0.001802 -0.000502 -0.000261 0.014969 0.000275 0.000017 0.000504
FLAG_MEF2D 0.112760 -0.040471 0.199126 -0.044870 0.296179 0.204750 0.089104 0.007856 0.574853 0.230889 0.258644 -0.015255 0.401014 0.108152 0.241751 -0.038352 1.000000 0.048114 0.016370 0.067318
In [111]:
df/
Out[111]:
0 1 2 3 4 5 6 7 8 9 ... RUNX1_filtered RUNX2_filtered RXRA_filtered SP1_filtered SREBF1_filtered STAT5B_filtered TFAP4_filtered ZEB2_filtered ZMYND8 ZNF281_filtered
CEBPA_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
HOXA9_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
PLAGL2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
MEIS1_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ETV6_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
HEX NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
FLAG_GFI1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
RUNX1_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
FOSL2_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
FLI1_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
GATA2_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
SP1_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
E2F3_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ZNF281_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
SREBF1_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
LMO2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
STAT5B_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ZMYND8 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
FLAG_MEF2C NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
LYL1_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
MYC_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
TFAP4_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
RUNX2_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
MAX_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
IRF8_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
MYB_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ZEB2_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
RXRA_filtered NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
FLAG_MEF2D NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

29 rows × 49 columns

In [109]:
df = pd.DataFrame(centroid,columns=merged.columns[cols:annot]).T
In [118]:
df = pd.DataFrame(centroid,columns=merged.columns[cols:annot]).T
#df = (df.T/df.max(1)).T
fig = sns.clustermap(df, vmax=1)
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'_centroids.pdf')

showing the cobinding matrix with the clusters

In [61]:
merged = merged.reset_index(drop=True)
In [119]:
rand = np.random.choice(merged.index,5000)

subgroups = groups[rand]
sorting = np.argsort(subgroups)
rainb = cm.get_cmap('gist_rainbow', len(set(groups)))
colors = [rainb(i) for i in subgroups[sorting]]

viridis = cm.get_cmap('viridis', 256)
data = merged[merged.columns[annot:]]
for val in data.columns:
    data[val]=np.log2(1+data[val])
data = (data-data.min())/(data.max()-data.min())
data = data.iloc[rand].iloc[sorting]
for val in data.columns:
    a = [viridis(int(v*256)) for v in data[val]]
    data[val] = a
data["clusters"]  = colors
<ipython-input-119-770781c76b2b>:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[val]=np.log2(1+data[val])
In [120]:
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[cols:annot]].iloc[rand].iloc[sorting].T), vmin=0, vmax=3, figsize=(20,20), z_score=0, colors_ratio=0.01, col_cluster=False,col_colors=data, xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("sorted clustermap of cobindings clustered with Kmeans")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'_clustermap_cobinding.pdf')
plt.show()

Computing enrichment on the clusters

In [121]:
enr, pvals, _ = chip.enrichment(merged, groups=groups)
../../JKBio/epigenetics/chipseq.py:1168: RuntimeWarning: divide by zero encountered in log2
  groups), columns=bedfile.columns[bedcol:])
In [122]:
# enrichment for each cluster
fig = sns.clustermap(enr.T,figsize=(14,20), col_cluster=0, vmax=4, vmin=-4, cmap='RdBu_r')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'enrichment_on_cluster_metasubset_zcorescale.pdf')
plt.show()

using bokeh

pvalue will be informed by opacity

In [67]:
fig = plot.correlationMatrix(enr.loc[enr.columns[leaves_list(link)]][enr.columns[leaves_list(link)]].values, maxokpval=10**-9,  pvals= pvals[enr.columns[leaves_list(link)]].loc[enr.columns[leaves_list(link)]], names=enr.columns[leaves_list(link)].tolist(), size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/', interactive=True, minval=-4, maxval=4, dataIsCorr=True)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-67-6ac8f1585189> in <module>
----> 1 fig = plot.correlationMatrix(enr.loc[enr.columns[leaves_list(link)]][enr.columns[leaves_list(link)]].values, maxokpval=10**-9,  pvals= pvals[enr.columns[leaves_list(link)]].loc[enr.columns[leaves_list(link)]], names=enr.columns[leaves_list(link)].tolist(), size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/', interactive=True, minval=-4, maxval=4, dataIsCorr=True)

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1766 
   1767             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1768             return self._getitem_axis(maybe_callable, axis=axis)
   1769 
   1770     def _is_scalar_access(self, key: Tuple):

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1952                     raise ValueError("Cannot index with multidimensional key")
   1953 
-> 1954                 return self._getitem_iterable(key, axis=axis)
   1955 
   1956             # nested tuple slicing

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_iterable(self, key, axis)
   1593         else:
   1594             # A collection of keys
-> 1595             keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
   1596             return self.obj._reindex_with_indexers(
   1597                 {axis: [keyarr, indexer]}, copy=True, allow_dups=True

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1550             keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
   1551 
-> 1552         self._validate_read_indexer(
   1553             keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
   1554         )

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1638             if missing == len(indexer):
   1639                 axis_name = self.obj._get_axis_name(axis)
-> 1640                 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   1641 
   1642             # We (temporarily) allow for some missing keys with .loc, except in

KeyError: "None of [Index(['regular_enhancer', 'FOSL2_filtered', 'HOXA9_filtered',\n       'super_enhancer', 'promoters', 'HEX', 'MYBL2', 'AFF4', 'H3K36me3',\n       'JUND_filtered', 'ATAC', 'H3K4me3', 'POLII', 'FLAG_GFI1', 'H3K36me2',\n       'H3K4me1', 'MLL_KTM2A', 'GATA2_filtered', 'IRF8_filtered',\n       'CEBPB_filtered', 'LYL1_filtered', 'MEIS1_filtered', 'RUNX1_filtered',\n       'RUNX2_filtered', 'FOXP1_filtered', 'STAT5B_filtered', 'MYC_filtered',\n       'FLI1_filtered', 'TFAP4_filtered', 'MYB_filtered', 'IKZF1_filtered',\n       'ZEB2_filtered', 'ELF2_filtered', 'MAX_filtered', 'ZNF281_filtered',\n       'ETV6_filtered', 'RARA_filtered', 'RXRA_filtered', 'E2F3_filtered',\n       'SP1_filtered', 'H3K79me2', 'PSER2', 'LMO2', 'FLAG_IRF2BP2', 'ZMYND8',\n       'H3K27ac', 'BRD4', 'FLAG_MEF2D', 'CDK9', 'MED1', 'PLAGL2',\n       'SREBF1_filtered', 'FLAG_SPI1', 'CEBPA_filtered', 'CTCF_filtered',\n       'FLAG_MEF2C', 'SMC1', 'H3K9ac', 'H3K27me3', 'repression', 'WDR5',\n       'activation', 'CDK13', 'LDB1'],\n      dtype='object')] are in the [index]"

showing enrichment over cobinding

In [68]:
plot.andrew(groups, merged, annot, cols=8, precise=False, folder = '../results/' + project + '/plots/' + version + '_' + merging_version + '_' + window + '_kmeans_', title = "sorted clustermap of cobindings clustered with Kmeans")
../../JKBio/epigenetics/chipseq.py:1146: RuntimeWarning: divide by zero encountered in log2
  enrichment[i, j] = np.log2(e)
../../JKBio/utils/plot.py:580: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subenr[subenr>rangeval]=rangeval
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2986: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
../../JKBio/utils/plot.py:581: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subenr[subenr<-rangeval]=-rangeval

Using SOMs

In [69]:
#Import the library
size = 20
#Build a network 20x20 with a weights format taken from the raw_data and activate Periodic Boundary Conditions. 
net = sps.somNet(size,size, scaled_data, PBC=True)

#Train the network for 10000 epochs and with initial learning rate of 0.01. 
net.train(0.01, 10000)

#Save the weights to file
net.save('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'_cobinding_SOMweights_'+str(size))
Periodic Boundary Conditions active.
The weights will be initialised randomly.
Training SOM... done!
In [712]:
net = sps.somNet(0,0, scaled_data, loadFile='../results/'+project+'/'+version+'_'+merging_version+'_'+window+'_cobinding_SOMweights_'+str(size), PBC=True)
Periodic Boundary Conditions active.
The weights will be loaded from file.
In [70]:
pathplot = "../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_somNets/"
! mkdir $pathplot
In [86]:
#Print a map of the network nodes and colour them according to the first feature (column number 0) of the dataset
#and then according to the distance between each node and its neighbours.
for col in range(scaled_data.shape[1]):
    net.nodes_graph(colnum=col, printout=True, show=True, path="../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_somNets/", colname=data.columns[col])
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
In [87]:
diffs = net.diff_graph(show=False, returns=True)
plt.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_cobinding_SOM_'+str(size)+'.pdf')
In [89]:
plot.SOMPlot(net, size=size, colnames=merged.columns[cols:annot], folder='../results/' + project + '/plots/' + version + '_' + merging_version + '_'+window, minweight=0.05)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-89-214f8280beb5> in <module>
----> 1 plot.SOMPlot(net, size=size, colnames=merged.columns[cols:annot], folder='../results/' + project + '/plots/' + version + '_' + merging_version + '_'+window, minweight=0.05)

~/JKBio/utils/plot.py in SOMPlot(net, size, colnames, minweight, distq1, distq2, distr, folder)
    567         somnodes.loc[i, 'features'] = tot
    568     #interactive SOM with features with highest importance to the nodes, displayed when hovering
--> 569     bigScatter(somnodes, precomputed=True, features=True, binsize=1, title='Cobinding SOM cluster of '+str(size), folder=folder)
    570 
    571 

~/JKBio/utils/plot.py in bigScatter(data, precomputed, logscale, features, colors, title, binsize, folder, showpoint)
    131         show(p)
    132     if folder:
--> 133       save(p, folder + title.replace(' ', "_") + "_scatter.html")
    134       #export_png(p, filename=folder + title.replace(' ', "_") + "_scatter.png")
    135 

~/.local/lib/python3.8/site-packages/bokeh/io/saving.py in save(obj, filename, resources, title, template, state, **kwargs)
     83 
     84     filename, resources, title = _get_save_args(state, filename, resources, title)
---> 85     _save_helper(obj, filename, resources, title, template)
     86     return abspath(filename)
     87 

~/.local/lib/python3.8/site-packages/bokeh/io/saving.py in _save_helper(obj, filename, resources, title, template)
    145     '''
    146     from ..embed import file_html
--> 147     html = file_html(obj, resources, title=title, template=template)
    148 
    149     with io.open(filename, mode="w", encoding="utf-8") as f:

~/.local/lib/python3.8/site-packages/bokeh/embed/standalone.py in file_html(models, resources, title, template, template_variables, theme, suppress_callback_warning, _always_new)
    301         models_seq = models
    302 
--> 303     with OutputDocumentFor(models_seq, apply_theme=theme, always_new=_always_new) as doc:
    304         (docs_json, render_items) = standalone_docs_json_and_render_items(models_seq, suppress_callback_warning=suppress_callback_warning)
    305         title = _title_from_models(models_seq, title)

/usr/lib/python3.8/contextlib.py in __enter__(self)
    111         del self.args, self.kwds, self.func
    112         try:
--> 113             return next(self.gen)
    114         except StopIteration:
    115             raise RuntimeError("generator didn't yield") from None

~/.local/lib/python3.8/site-packages/bokeh/embed/util.py in OutputDocumentFor(objs, apply_theme, always_new)
    132             doc = Document()
    133             for model in objs:
--> 134                 doc.add_root(model)
    135 
    136         # handle a single shared document

~/.local/lib/python3.8/site-packages/bokeh/document/document.py in add_root(self, model, setter)
    317             self._roots.append(model)
    318         finally:
--> 319             self._pop_all_models_freeze()
    320         self._trigger_on_change(RootAddedEvent(self, model, setter))
    321 

~/.local/lib/python3.8/site-packages/bokeh/document/document.py in _pop_all_models_freeze(self)
   1054         self._all_models_freeze_count -= 1
   1055         if self._all_models_freeze_count == 0:
-> 1056             self._recompute_all_models()
   1057 
   1058     def _recompute_all_models(self):

~/.local/lib/python3.8/site-packages/bokeh/document/document.py in _recompute_all_models(self)
   1077             d._detach_document()
   1078         for a in to_attach:
-> 1079             a._attach_document(self)
   1080         self._all_models = recomputed
   1081         self._all_models_by_name = recomputed_by_name

~/.local/lib/python3.8/site-packages/bokeh/model.py in _attach_document(self, doc)
    669         '''
    670         if self._document is not None and self._document is not doc:
--> 671             raise RuntimeError("Models must be owned by only a single document, %r is already in a doc" % (self))
    672         doc.theme.apply_to_model(self)
    673         self._document = doc

RuntimeError: Models must be owned by only a single document, Toolbar(id='1759', ...) is already in a doc
In [90]:
#Cluster the datapoints according to the Quality Threshold algorithm.
clusts = net.cluster(scaled_data, type='KMeans',numcl=n_clust[n])
sorting  = []
for clust in clusts:
    sorting.extend(clust)
sorting = np.array(sorting)
Warning: Only Quality Threshold and Density Peak clustering work with PBC
<Figure size 432x288 with 0 Axes>
In [91]:
viridis = cm.get_cmap('gist_rainbow', len(clusts))
colors=[]
for i, clust in enumerate(clusts):
    colors.extend([viridis(i)]*len(clust))
colors = np.array(colors)
viridis = cm.get_cmap('viridis', 256)
data = merged[merged.columns[annot:]]
for val in data.columns:
    data[val] = np.log2(1+data[val])
data = (data-data.min())/(data.max()-data.min())
rand = np.random.choice(range(len(sorting)),5000)
rand.sort()
data = data.iloc[sorting[rand]]
for val in data.columns:
    a = [viridis(int(v*256)) for v in data[val]]
    data[val] = a
data["clusters"]  = [tuple(i) for i in colors[rand]]
[autoreload of JKBio.epigenetics.chipseq failed: Traceback (most recent call last):
  File "/home/jeremie/.local/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/jeremie/.local/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/usr/lib/python3.8/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/usr/lib/python3.8/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 604, in _exec
  File "<frozen importlib._bootstrap_external>", line 779, in exec_module
  File "<frozen importlib._bootstrap_external>", line 916, in get_code
  File "<frozen importlib._bootstrap_external>", line 846, in source_to_code
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "../../JKBio/epigenetics/chipseq.py", line 920
    finalpeaks = finalpeaks[np.logical_or(tot>1,peakmatrix[biggest_ind])]
                                                                        ^
TabError: inconsistent use of tabs and spaces in indentation
]
<ipython-input-91-3d497b534963>:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[val] = np.log2(1+data[val])
In [92]:
#sorted clustermap of cobindings clustered with SOM
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[cols:annot]].iloc[sorting[rand]].T), vmin=0,vmax=3,figsize=(20,15),z_score=0, colors_ratio=0.01, col_cluster=False,col_colors=data, xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("sorted clustermap of cobindings clustered with SOM")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_SOM_'+str(n_clust)+'_clustermap_cobinding.pdf')
plt.show()
In [93]:
#let's look at the enrichment over SOM clusters
groups = []
for i, val in enumerate(clusts):
    groups.extend([i]*len(val))
enr, pvals, _ = chip.enrichment(merged, groups = np.array(groups))
[autoreload of JKBio.epigenetics.chipseq failed: Traceback (most recent call last):
  File "/home/jeremie/.local/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/jeremie/.local/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/usr/lib/python3.8/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/usr/lib/python3.8/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 604, in _exec
  File "<frozen importlib._bootstrap_external>", line 779, in exec_module
  File "<frozen importlib._bootstrap_external>", line 916, in get_code
  File "<frozen importlib._bootstrap_external>", line 846, in source_to_code
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "../../JKBio/epigenetics/chipseq.py", line 922
    finalpeaks = finalpeaks[np.logical_or(tot>1,peakmatrix[biggest_ind])]
                                                                        ^
TabError: inconsistent use of tabs and spaces in indentation
]
../../JKBio/epigenetics/chipseq.py:1146: RuntimeWarning: divide by zero encountered in log2
  enrichment[i, j] = np.log2(e)
In [95]:
fig = sns.clustermap(enr.T,figsize=(12,12), vmax=4, vmin=-4, cmap='RdBu_r')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'enrichment_on_SOMcluster_metasubset.pdf')
plt.show()

Plot TSNE density map

In [123]:
rand = np.random.choice(merged.index,30000)
In [124]:
red_data = TSNE(2,600,verbose=10,n_iter=1500).fit_transform(scaled_data[rand])
np.save('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'red_data.npy',red_data)
[t-SNE] Computing 1801 nearest neighbors...
[t-SNE] Indexed 30000 samples in 2.828s...
[t-SNE] Computed neighbors for 30000 samples in 129.902s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30000
[t-SNE] Computed conditional probabilities for sample 2000 / 30000
[t-SNE] Computed conditional probabilities for sample 3000 / 30000
[t-SNE] Computed conditional probabilities for sample 4000 / 30000
[t-SNE] Computed conditional probabilities for sample 5000 / 30000
[t-SNE] Computed conditional probabilities for sample 6000 / 30000
[t-SNE] Computed conditional probabilities for sample 7000 / 30000
[t-SNE] Computed conditional probabilities for sample 8000 / 30000
[t-SNE] Computed conditional probabilities for sample 9000 / 30000
[t-SNE] Computed conditional probabilities for sample 10000 / 30000
[t-SNE] Computed conditional probabilities for sample 11000 / 30000
[t-SNE] Computed conditional probabilities for sample 12000 / 30000
[t-SNE] Computed conditional probabilities for sample 13000 / 30000
[t-SNE] Computed conditional probabilities for sample 14000 / 30000
[t-SNE] Computed conditional probabilities for sample 15000 / 30000
[t-SNE] Computed conditional probabilities for sample 16000 / 30000
[t-SNE] Computed conditional probabilities for sample 17000 / 30000
[t-SNE] Computed conditional probabilities for sample 18000 / 30000
[t-SNE] Computed conditional probabilities for sample 19000 / 30000
[t-SNE] Computed conditional probabilities for sample 20000 / 30000
[t-SNE] Computed conditional probabilities for sample 21000 / 30000
[t-SNE] Computed conditional probabilities for sample 22000 / 30000
[t-SNE] Computed conditional probabilities for sample 23000 / 30000
[t-SNE] Computed conditional probabilities for sample 24000 / 30000
[t-SNE] Computed conditional probabilities for sample 25000 / 30000
[t-SNE] Computed conditional probabilities for sample 26000 / 30000
[t-SNE] Computed conditional probabilities for sample 27000 / 30000
[t-SNE] Computed conditional probabilities for sample 28000 / 30000
[t-SNE] Computed conditional probabilities for sample 29000 / 30000
[t-SNE] Computed conditional probabilities for sample 30000 / 30000
[t-SNE] Mean sigma: 0.000000
[t-SNE] Computed conditional probabilities in 47.288s
[t-SNE] Iteration 50: error = 69.6079483, gradient norm = 0.0000002 (50 iterations in 67.974s)
[t-SNE] Iteration 100: error = 69.4819565, gradient norm = 0.0008712 (50 iterations in 76.128s)
[t-SNE] Iteration 150: error = 67.5626373, gradient norm = 0.0002657 (50 iterations in 77.880s)
[t-SNE] Iteration 200: error = 67.4055862, gradient norm = 0.0000265 (50 iterations in 69.300s)
[t-SNE] Iteration 250: error = 67.3987503, gradient norm = 0.0000096 (50 iterations in 68.820s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 67.398750
[t-SNE] Iteration 300: error = 1.7217795, gradient norm = 0.0012071 (50 iterations in 68.830s)
[t-SNE] Iteration 350: error = 1.3091971, gradient norm = 0.0004930 (50 iterations in 70.872s)
[t-SNE] Iteration 450: error = 1.0747644, gradient norm = 0.0001760 (50 iterations in 68.644s)
[t-SNE] Iteration 500: error = 1.0283132, gradient norm = 0.0001205 (50 iterations in 70.856s)
[t-SNE] Iteration 550: error = 1.0000806, gradient norm = 0.0000886 (50 iterations in 73.080s)
[t-SNE] Iteration 600: error = 0.9805763, gradient norm = 0.0000665 (50 iterations in 73.820s)
[t-SNE] Iteration 650: error = 0.9670953, gradient norm = 0.0000509 (50 iterations in 74.417s)
[t-SNE] Iteration 700: error = 0.9577295, gradient norm = 0.0000420 (50 iterations in 75.134s)
[t-SNE] Iteration 750: error = 0.9508802, gradient norm = 0.0000347 (50 iterations in 83.492s)
[t-SNE] Iteration 800: error = 0.9458879, gradient norm = 0.0000281 (50 iterations in 81.067s)
[t-SNE] Iteration 850: error = 0.9421903, gradient norm = 0.0000240 (50 iterations in 78.957s)
[t-SNE] Iteration 900: error = 0.9393796, gradient norm = 0.0000207 (50 iterations in 70.077s)
[t-SNE] Iteration 950: error = 0.9371912, gradient norm = 0.0000184 (50 iterations in 68.379s)
[t-SNE] Iteration 1000: error = 0.9354650, gradient norm = 0.0000168 (50 iterations in 70.040s)
[t-SNE] Iteration 1050: error = 0.9340342, gradient norm = 0.0000165 (50 iterations in 71.140s)
[t-SNE] Iteration 1100: error = 0.9330671, gradient norm = 0.0000167 (50 iterations in 41.325s)
[t-SNE] Iteration 1150: error = 0.9322598, gradient norm = 0.0000160 (50 iterations in 33.687s)
[t-SNE] Iteration 1200: error = 0.9316489, gradient norm = 0.0000153 (50 iterations in 29.537s)
[t-SNE] Iteration 1250: error = 0.9312585, gradient norm = 0.0000118 (50 iterations in 29.071s)
[t-SNE] Iteration 1300: error = 0.9309396, gradient norm = 0.0000118 (50 iterations in 31.245s)
[t-SNE] Iteration 1350: error = 0.9307129, gradient norm = 0.0000101 (50 iterations in 33.305s)
[t-SNE] Iteration 1400: error = 0.9304866, gradient norm = 0.0000119 (50 iterations in 35.358s)
[t-SNE] Iteration 1450: error = 0.9302984, gradient norm = 0.0000110 (50 iterations in 38.106s)
[t-SNE] Iteration 1500: error = 0.9301001, gradient norm = 0.0000152 (50 iterations in 38.851s)
[t-SNE] KL divergence after 1500 iterations: 0.930100
In [125]:
# Let's look at the data. the density map of the distribution of all pseudo enhancer over their tf binding profile
_, ax = plt.subplots(figsize=(10,10))
fig = sns.kdeplot(red_data[:,0], red_data[:,1], shade=True, ax=ax)
plt.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_density_TSNE.pdf')
In [126]:
# now let's look at it from anothe rploting method. we can see many many islands
plot.bigScatter(red_data,binsize=0.4,showpoint=False,precomputed=False, logscale=True, title='density plot of enhancers in TF cobinding space with TSNE', folder='../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+"_")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:125: UserWarning: save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN
  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:138: UserWarning: save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-126-50c49b92d056> in <module>
      1 # now let's look at it from anothe rploting method. we can see many many islands
----> 2 plot.bigScatter(red_data,binsize=0.4,showpoint=False,precomputed=False, logscale=True, title='density plot of enhancers in TF cobinding space with TSNE', folder='../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+"_")

~/JKBio/utils/plot.py in bigScatter(data, precomputed, logscale, features, title, binsize, folder, showpoint)
    157         show(p)
    158     if folder:
--> 159       save(p, folder + title.replace(' ', "_") + "_scatter.html")
    160       #export_png(p, filename=folder + title.replace(' ', "_") + "_scatter.png")
    161 

~/.local/lib/python3.8/site-packages/bokeh/io/saving.py in save(obj, filename, resources, title, template, state, **kwargs)
     83 
     84     filename, resources, title = _get_save_args(state, filename, resources, title)
---> 85     _save_helper(obj, filename, resources, title, template)
     86     return abspath(filename)
     87 

~/.local/lib/python3.8/site-packages/bokeh/io/saving.py in _save_helper(obj, filename, resources, title, template)
    145     '''
    146     from ..embed import file_html
--> 147     html = file_html(obj, resources, title=title, template=template)
    148 
    149     with io.open(filename, mode="w", encoding="utf-8") as f:

~/.local/lib/python3.8/site-packages/bokeh/embed/standalone.py in file_html(models, resources, title, template, template_variables, theme, suppress_callback_warning, _always_new)
    301         models_seq = models
    302 
--> 303     with OutputDocumentFor(models_seq, apply_theme=theme, always_new=_always_new) as doc:
    304         (docs_json, render_items) = standalone_docs_json_and_render_items(models_seq, suppress_callback_warning=suppress_callback_warning)
    305         title = _title_from_models(models_seq, title)

/usr/lib/python3.8/contextlib.py in __enter__(self)
    111         del self.args, self.kwds, self.func
    112         try:
--> 113             return next(self.gen)
    114         except StopIteration:
    115             raise RuntimeError("generator didn't yield") from None

~/.local/lib/python3.8/site-packages/bokeh/embed/util.py in OutputDocumentFor(objs, apply_theme, always_new)
    132             doc = Document()
    133             for model in objs:
--> 134                 doc.add_root(model)
    135 
    136         # handle a single shared document

~/.local/lib/python3.8/site-packages/bokeh/document/document.py in add_root(self, model, setter)
    317             self._roots.append(model)
    318         finally:
--> 319             self._pop_all_models_freeze()
    320         self._trigger_on_change(RootAddedEvent(self, model, setter))
    321 

~/.local/lib/python3.8/site-packages/bokeh/document/document.py in _pop_all_models_freeze(self)
   1054         self._all_models_freeze_count -= 1
   1055         if self._all_models_freeze_count == 0:
-> 1056             self._recompute_all_models()
   1057 
   1058     def _recompute_all_models(self):

~/.local/lib/python3.8/site-packages/bokeh/document/document.py in _recompute_all_models(self)
   1077             d._detach_document()
   1078         for a in to_attach:
-> 1079             a._attach_document(self)
   1080         self._all_models = recomputed
   1081         self._all_models_by_name = recomputed_by_name

~/.local/lib/python3.8/site-packages/bokeh/model.py in _attach_document(self, doc)
    669         '''
    670         if self._document is not None and self._document is not doc:
--> 671             raise RuntimeError("Models must be owned by only a single document, %r is already in a doc" % (self))
    672         doc.theme.apply_to_model(self)
    673         self._document = doc

RuntimeError: Models must be owned by only a single document, BoxAnnotation(id='1924', ...) is already in a doc
In [ ]:
## TODO: color by dependencies/pathways/expression.. after ABC model,